Code
## Load packages: tidymodels for modeling, tidyverse for wrangling and
## plotting, janitor for column-name cleaning, naniar for missing-data
## tools, assertr for data validation, corrplot and gridExtra for plots.
## (The original file loaded this exact block twice; once is enough.)
library(tidymodels)
library(tidyverse)
library(janitor)
library(naniar)
library(assertr)
library(corrplot)
library(gridExtra)

## Turn off scientific notation for readable numbers
options(scipen = 999)

We filter out missing values in our key predicted outcome, since those rows would cause our prediction models to fail and provide no additional data to the model. For missingness in our predictors, however, imputation helps ensure that non-systematic missingness does not adversely shrink the sample size of our model.
## Read the remittance panel, standardize the column names, keep only rows
## where the outcome (remittances_gdp) is observed, and construct one-year
## lags of GDP per capita and deportations within each country.
remittances <- read_csv("../data/remittances.csv") %>%
  clean_names() %>%
  filter(!is.na(remittances_gdp)) %>%
  arrange(country_name, year) %>%
  group_by(country_name) %>%
  mutate(
    gdp_lag = lag(gdp_per),
    deportations_lag = lag(deportations)
  ) %>%
  ungroup()

## Set seed for reproducibility
## Fix the RNG state so the random train/test split is reproducible
set.seed(20251211)

## Hold out 20% of observations for testing; train on the remaining 80%
remit_split <- initial_split(remittances, prop = 0.8)
remit_train <- training(remit_split)
remit_test <- testing(remit_split)

## View the first few rows and column types
glimpse(remit_train)Rows: 3,292
Columns: 21
$ country_name <chr> "Finland", "Malawi", "Jordan", "Canada", "Ukraine",…
$ year <dbl> 1994, 2023, 2014, 2022, 2021, 1996, 1996, 2001, 201…
$ remittances <dbl> 66893753, 179747923, 6369718310, 863836976, 1806000…
$ remittances_gdp <dbl> 0.06483040, 1.41398522, 17.28663681, 0.03943721, 9.…
$ country_code <chr> "FIN", "MWI", "JOR", "CAN", "UKR", "EST", "SVK", "T…
$ gdp <dbl> 103182697780, 12712150082, 36847643521, 21904110801…
$ stock <dbl> NA, NA, 172250.87, NA, 407204.58, NA, NA, 167650.92…
$ unemployment <dbl> 16.426, 5.033, 11.900, 5.279, 9.834, 9.915, 11.341,…
$ gdp_per <dbl> 20278.2911, 602.3436, 4191.1805, 56256.8007, 4775.9…
$ inflation <dbl> 1.84497581, 22.80672202, 3.44532356, 7.90676213, 24…
$ vulnerable_emp <dbl> 12.8965222, 60.5104917, 9.1662623, 9.7743867, 17.91…
$ maternal_mortality <dbl> 7, 225, 43, 15, 26, 36, 10, 47, NA, 28, 10, 7, 1492…
$ exchange_rate <dbl> 5.2235125, 1161.0943670, 0.7100000, 1.3015548, 27.2…
$ deportations <dbl> NA, 2, 81, 97, 56, NA, NA, 98, 0, 17, 47, 6, 268, 0…
$ internet <dbl> 4.920000, 17.988100, 46.200000, 94.000000, 79.21830…
$ poverty <dbl> NA, NA, NA, NA, NA, NA, 1.7, NA, NA, NA, NA, 4.6, N…
$ dist_pop <dbl> 6625.9230, 12519.8200, 9209.4960, 548.3946, 7515.91…
$ dist_cap <dbl> 6943.1820, 12780.8200, 9540.5230, 737.0425, 7842.33…
$ terror <dbl> 1, 2, 3, 2, 3, 2, NA, 2, NA, 3, 1, 2, 3, 1, 4, 1, 1…
$ gdp_lag <dbl> NA, 604.2698, 4311.2192, 52886.6616, 3709.7693, 313…
$ deportations_lag <dbl> NA, 6, 100, 69, 151, NA, NA, 85, 0, 15, NA, 2, 211,…
The training data has 3,292 rows and 21 columns. We can see:
## Get min, max, mean, median for all variables
summary(remit_train) country_name year remittances remittances_gdp
Length:3292 Min. :1994 Min. : 6038 Min. : 0.000029
Class :character 1st Qu.:2002 1st Qu.: 70913036 1st Qu.: 0.370960
Mode :character Median :2010 Median : 532205864 Median : 1.651116
Mean :2010 Mean : 2659137842 Mean : 4.047375
3rd Qu.:2017 3rd Qu.: 1989312698 3rd Qu.: 4.654784
Max. :2024 Max. :137674533896 Max. :128.977537
country_code gdp stock
Length:3292 Min. : 37184925 Min. : 259.6
Class :character 1st Qu.: 6202697120 1st Qu.: 39220.7
Mode :character Median : 25470203093 Median : 95864.5
Mean : 307632024591 Mean : 390000.5
3rd Qu.: 169292646245 3rd Qu.: 257576.1
Max. :18743803170800 Max. :23126089.8
NA's :1453
unemployment gdp_per inflation vulnerable_emp
Min. : 0.100 Min. : 109.6 Min. : -32.741 Min. : 0.1257
1st Qu.: 3.913 1st Qu.: 1389.2 1st Qu.: 1.782 1st Qu.:13.7339
Median : 6.345 Median : 4352.4 Median : 4.144 Median :33.0105
Mean : 7.907 Mean : 12207.3 Mean : 10.368 Mean :38.5949
3rd Qu.:10.610 3rd Qu.: 14268.8 3rd Qu.: 8.567 3rd Qu.:60.8727
Max. :34.007 Max. :138935.0 Max. :2240.169 Max. :93.9912
NA's :178 NA's :32 NA's :274
maternal_mortality exchange_rate deportations internet
Min. : 1.0 Min. : 0.0012 Min. : 0.0 Min. : 0.000
1st Qu.: 14.0 1st Qu.: 1.6933 1st Qu.: 7.0 1st Qu.: 3.394
Median : 60.0 Median : 8.2747 Median : 33.0 Median : 23.000
Mean : 168.9 Mean : 378.4053 Mean : 919.5 Mean : 34.089
3rd Qu.: 218.2 3rd Qu.: 115.5477 3rd Qu.: 129.0 3rd Qu.: 64.115
Max. :5721.0 Max. :15855.4483 Max. :90504.0 Max. :100.000
NA's :152 NA's :38 NA's :372 NA's :230
poverty dist_pop dist_cap terror
Min. : 0.00 Min. : 548.4 Min. : 737 Min. :1.000
1st Qu.: 0.30 1st Qu.: 6035.3 1st Qu.: 6274 1st Qu.:1.000
Median : 1.50 Median : 7929.3 Median : 8260 Median :2.000
Mean :10.24 Mean : 8538.5 Mean : 8685 Mean :2.329
3rd Qu.:10.90 3rd Qu.:11720.2 3rd Qu.:11685 3rd Qu.:3.000
Max. :89.40 Max. :16180.3 Max. :16371 Max. :5.000
NA's :1923 NA's :146 NA's :146 NA's :139
gdp_lag deportations_lag
Min. : 109.6 Min. : 0.0
1st Qu.: 1370.8 1st Qu.: 7.0
Median : 4229.9 Median : 33.0
Mean : 11993.0 Mean : 883.9
3rd Qu.: 14065.6 3rd Qu.: 128.0
Max. :132604.4 Max. :90504.0
NA's :121 NA's :491
Key observations:
## Detailed structure of the data
str(remit_train)tibble [3,292 × 21] (S3: tbl_df/tbl/data.frame)
$ country_name : chr [1:3292] "Finland" "Malawi" "Jordan" "Canada" ...
$ year : num [1:3292] 1994 2023 2014 2022 2021 ...
$ remittances : num [1:3292] 66893753 179747923 6369718310 863836976 18060000000 ...
$ remittances_gdp : num [1:3292] 0.0648 1.414 17.2866 0.0394 9.0406 ...
$ country_code : chr [1:3292] "FIN" "MWI" "JOR" "CAN" ...
$ gdp : num [1:3292] 103182697780 12712150082 36847643521 2190411080134 199765859571 ...
$ stock : num [1:3292] NA NA 172251 NA 407205 ...
$ unemployment : num [1:3292] 16.43 5.03 11.9 5.28 9.83 ...
$ gdp_per : num [1:3292] 20278 602 4191 56257 4776 ...
$ inflation : num [1:3292] 1.84 22.81 3.45 7.91 24.8 ...
$ vulnerable_emp : num [1:3292] 12.9 60.51 9.17 9.77 17.91 ...
$ maternal_mortality: num [1:3292] 7 225 43 15 26 36 10 47 NA 28 ...
$ exchange_rate : num [1:3292] 5.22 1161.09 0.71 1.3 27.29 ...
$ deportations : num [1:3292] NA 2 81 97 56 NA NA 98 0 17 ...
$ internet : num [1:3292] 4.92 17.99 46.2 94 79.22 ...
$ poverty : num [1:3292] NA NA NA NA NA NA 1.7 NA NA NA ...
$ dist_pop : num [1:3292] 6626 12520 9209 548 7516 ...
$ dist_cap : num [1:3292] 6943 12781 9541 737 7842 ...
$ terror : num [1:3292] 1 2 3 2 3 2 NA 2 NA 3 ...
$ gdp_lag : num [1:3292] NA 604 4311 52887 3710 ...
$ deportations_lag : num [1:3292] NA 6 100 69 151 NA NA 85 0 15 ...
All numeric variables are stored as num or dbl (double precision), and text variables are stored as chr (character).
## Check current column names
names(remit_train) [1] "country_name" "year" "remittances"
[4] "remittances_gdp" "country_code" "gdp"
[7] "stock" "unemployment" "gdp_per"
[10] "inflation" "vulnerable_emp" "maternal_mortality"
[13] "exchange_rate" "deportations" "internet"
[16] "poverty" "dist_pop" "dist_cap"
[19] "terror" "gdp_lag" "deportations_lag"
The column names are already lowercase with underscores, because `clean_names()` was applied when the data was read in; we re-run it below to confirm.
## Standardize column names to lowercase snake_case. This is idempotent:
## clean_names() was already applied when the data was read in, so this
## call simply confirms the convention holds on the training split.
remit_train <- clean_names(remit_train)
## Verify the cleaned names
names(remit_train) [1] "country_name" "year" "remittances"
[4] "remittances_gdp" "country_code" "gdp"
[7] "stock" "unemployment" "gdp_per"
[10] "inflation" "vulnerable_emp" "maternal_mortality"
[13] "exchange_rate" "deportations" "internet"
[16] "poverty" "dist_pop" "dist_cap"
[19] "terror" "gdp_lag" "deportations_lag"
Now all column names are lowercase with underscores.
## Count NA values in each column
colSums(is.na(remit_train)) country_name year remittances remittances_gdp
0 0 0 0
country_code gdp stock unemployment
0 0 1453 178
gdp_per inflation vulnerable_emp maternal_mortality
0 32 274 152
exchange_rate deportations internet poverty
38 372 230 1923
dist_pop dist_cap terror gdp_lag
146 146 139 121
deportations_lag
491
Several variables have missing data. Let’s calculate the percentage!
## Calculate percent missing for each variable
remit_train |>
summarise(across(everything(), ~sum(is.na(.)) / n() * 100))# A tibble: 1 × 21
country_name year remittances remittances_gdp country_code gdp stock
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 0 0 0 0 0 44.1
# ℹ 14 more variables: unemployment <dbl>, gdp_per <dbl>, inflation <dbl>,
# vulnerable_emp <dbl>, maternal_mortality <dbl>, exchange_rate <dbl>,
# deportations <dbl>, internet <dbl>, poverty <dbl>, dist_pop <dbl>,
# dist_cap <dbl>, terror <dbl>, gdp_lag <dbl>, deportations_lag <dbl>
Major findings:
## Get min, max, mean, median, quartiles
summary(remit_train$remittances) Min. 1st Qu. Median Mean 3rd Qu. Max.
6038 70913036 532205864 2659137842 1989312698 137674533896
The mean (about $2.66 billion) is much larger than the median (about $532 million). The data is right-skewed.
## Create a point plot (from class notes Section 5.6.1)
remit_train |>
select(remittances, remittances_gdp, gdp, stock, unemployment,deportations) |>
pivot_longer(everything()) |>
ggplot(aes(value)) +
geom_histogram(bins = 30) +
facet_wrap(~name, scales = "free") +
theme_minimal()Warning: Removed 2003 rows containing non-finite outside the scale range
(`stat_bin()`).
#CAVEAT: I could not 'clean' the x axis (log would be the way to solve it but
#we want to show how skewed the distribution is)## Create a point plot (from class notes Section 5.6.1)
remit_train |>
ggplot(aes(remittances, 1)) +
geom_point(alpha = 0.2) +
scale_y_continuous(breaks = 0) +
labs(y = NULL, title = "Distribution of Remittances") +
theme_bw() +
theme(panel.border = element_blank())Most points cluster on the left (lower values) with a few extreme points on the right (confirms right-skewness).
## Create histogram with 30 bins
remit_train |>
ggplot(aes(x = remittances)) +
geom_histogram(bins = 30, fill = "steelblue") +
theme_minimal() +
labs(title = "Distribution of Remittances",
x = "Remittances (USD)",
y = "Count")The histogram is heavily concentrated on the left with a long tail to the right. This is classic right-skewed data.
## Create boxplot to see outliers
remit_train |>
ggplot(aes(y = remittances)) +
geom_boxplot(fill = "steelblue") +
theme_minimal() +
labs(title = "Boxplot of Remittances",
y = "Remittances (USD)")Many points appear above the upper whisker (outliers). These are likely large countries like Mexico that receive billions in remittances.
## Get summary statistics
summary(remit_train$gdp) Min. 1st Qu. Median Mean 3rd Qu.
37184925 6202697120 25470203093 307632024591 169292646245
Max.
18743803170827
GDP also shows huge range - from $21 million to $18.7 trillion.
## Create point plot for GDP
remit_train |>
ggplot(aes(gdp, 1)) +
geom_point(alpha = 0.2) +
scale_y_continuous(breaks = 0) +
labs(y = NULL, title = "Distribution of GDP") +
theme_bw() +
theme(panel.border = element_blank())GDP shows the same right-skewed pattern as remittances. Large economies have much higher GDP than small economies.
## Get summary statistics
summary(remit_train$unemployment) Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.100 3.913 6.345 7.907 10.610 34.007 178
Unemployment ranges from 0.11% to 34.01%.
## Create point plot for unemployment
remit_train |>
ggplot(aes(unemployment, 1)) +
geom_point(alpha = 0.2) +
scale_y_continuous(breaks = 0) +
labs(y = NULL, title = "Distribution of Unemployment") +
theme_bw() +
theme(panel.border = element_blank())Warning: Removed 178 rows containing missing values or values outside the scale range
(`geom_point()`).
Unemployment appears more evenly distributed than remittances or GDP.
## Get summary statistics
summary(remit_train$inflation) Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
-32.741 1.782 4.144 10.368 8.567 2240.169 32
The maximum inflation is 2,240% (an extreme outlier). Most inflation values fall between roughly 1.8% and 8.6% (the interquartile range).
## Create point plot for inflation
remit_train |>
ggplot(aes(inflation, 1)) +
geom_point(alpha = 0.2) +
scale_y_continuous(breaks = 0) +
labs(y = NULL, title = "Distribution of Inflation") +
theme_bw() +
theme(panel.border = element_blank())Warning: Removed 32 rows containing missing values or values outside the scale range
(`geom_point()`).
## Test that remittances > 0 when not missing
remit_train |>
filter(!is.na(remittances)) |>
verify(remittances > 0) |>
summarise(mean_remittances = mean(remittances, na.rm = TRUE))# A tibble: 1 × 1
mean_remittances
<dbl>
1 2659137842.
All remittances are positive. The mean is about $2.66 billion.
## Test that years are between 1994 and 2024
remit_train |>
verify(year >= 1994 & year <= 2024) |>
summarise(mean_year = mean(year))# A tibble: 1 × 1
mean_year
<dbl>
1 2010.
All years are within the expected range.
## Test that unemployment is between 0 and 100
remit_train |>
filter(!is.na(unemployment)) |>
verify(unemployment >= 0 & unemployment <= 100) |>
summarise(mean_unemployment = mean(unemployment, na.rm = TRUE))# A tibble: 1 × 1
mean_unemployment
<dbl>
1 7.91
All unemployment values are valid percentages (0-100%).
Since remittances and GDP are highly right-skewed, we need to create and examine log.
## Create log-transformed versions of skewed variables
remit_train <- remit_train |>
mutate(
log_remittances = log(remittances + 1),
log_gdp = log(gdp + 1)
)

We add 1 before taking the log to handle any zero values (log(0) is undefined).
## Get summary statistics for log remittances
summary(remit_train$log_remittances) Min. 1st Qu. Median Mean 3rd Qu. Max.
8.706 18.077 20.093 19.725 21.411 25.648
## Create histogram for log-transformed remittances
remit_train |>
filter(!is.na(log_remittances)) |>
ggplot(aes(x = log_remittances)) +
geom_histogram(bins = 30, fill = "darkgreen", color = "white") +
theme_minimal() +
labs(title = "Distribution of Log-Transformed Remittances",
subtitle = "Much more normal distribution after log transformation",
x = "Log(Remittances + 1)",
y = "Count")The log-transformed remittances show a much more normal distribution compared to the original right-skewed data.
## Create boxplot for log remittances
remit_train |>
filter(!is.na(log_remittances)) |>
ggplot(aes(y = log_remittances)) +
geom_boxplot(fill = "darkgreen") +
theme_minimal() +
labs(title = "Boxplot of Log-Transformed Remittances",
y = "Log(Remittances + 1)")Fewer outliers visible after log transformation.
## Get summary statistics for log GDP
summary(remit_train$log_gdp) Min. 1st Qu. Median Mean 3rd Qu. Max.
17.43 22.55 23.96 24.11 25.85 30.56
## Create histogram for log-transformed GDP
remit_train |>
filter(!is.na(log_gdp)) |>
ggplot(aes(x = log_gdp)) +
geom_histogram(bins = 30, fill = "darkblue", color = "white") +
theme_minimal() +
labs(title = "Distribution of Log-Transformed GDP",
subtitle = "More normal distribution after log transformation",
x = "Log(GDP + 1)",
y = "Count")Log GDP also shows a more normal distribution.
## Create comparison plots
p1 <- remit_train |>
filter(!is.na(remittances)) |>
ggplot(aes(x = remittances)) +
geom_histogram(bins = 30, fill = "steelblue") +
theme_minimal() +
labs(title = "Original Remittances (Right-Skewed)",
x = "Remittances (USD)")
p2 <- remit_train |>
filter(!is.na(log_remittances)) |>
ggplot(aes(x = log_remittances)) +
geom_histogram(bins = 30, fill = "darkgreen") +
theme_minimal() +
labs(title = "Log-Transformed Remittances (More Normal)",
x = "Log(Remittances + 1)")
grid.arrange(p1, p2, ncol = 2)The log transformation successfully converts the right-skewed distribution into a more normal distribution, which is better for modeling.
## Create comparison plots for GDP
p3 <- remit_train |>
filter(!is.na(gdp)) |>
ggplot(aes(x = gdp)) +
geom_histogram(bins = 30, fill = "steelblue") +
theme_minimal() +
labs(title = "Original GDP (Right-Skewed)",
x = "GDP (USD)")
p4 <- remit_train |>
filter(!is.na(log_gdp)) |>
ggplot(aes(x = log_gdp)) +
geom_histogram(bins = 30, fill = "darkblue") +
theme_minimal() +
labs(title = "Log-Transformed GDP (More Normal)",
x = "Log(GDP + 1)")
grid.arrange(p3, p4, ncol = 2)## Scatter plot with log-transformed variables
remit_train |>
filter(!is.na(log_gdp), !is.na(log_remittances)) |>
ggplot(aes(x = log_gdp, y = log_remittances)) +
geom_point(alpha = 0.3, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red") +
theme_minimal() +
labs(title = "Log GDP vs Log Remittances",
subtitle = "Clearer linear relationship after log transformation",
x = "Log(GDP + 1)",
y = "Log(Remittances + 1)")`geom_smooth()` using formula = 'y ~ x'
The relationship between log GDP and log remittances is more linear than the original variables, which will improve model performance.
ggplot(remit_train, aes(x = deportations + 1, y = remittances + 1)) +
geom_point(alpha = 0.3, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red") +
scale_x_log10() +
scale_y_log10() +
labs(
title = "Log–Log Relationship Between Deportations and Remittances",
x = "Log(Deportations + 1)",
y = "Log(Remittances + 1)"
) +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 372 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 372 rows containing missing values or values outside the scale range
(`geom_point()`).
## Count distinct country names
n_distinct(remit_train$country_name)

[1] 150

We have 150 different countries in the training dataset.
## Show how many observations per country
head(table(remit_train$country_name), 20)
Afghanistan Albania Algeria Angola
15 25 28 16
Antigua and Barbuda Argentina Armenia Australia
25 28 21 24
Austria Azerbaijan Bangladesh Barbados
28 24 27 21
Belarus Belgium Belize Benin
21 20 23 22
Bermuda Bhutan Bolivia Botswana
17 14 27 24
Most countries have between 20-30 observations, representing roughly 20-30 years of data.
## Count observations per country and plot top 20
remit_train |>
count(country_name, sort = TRUE) |>
slice_head(n = 20) |>
ggplot(aes(x = reorder(country_name, n), y = n)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
theme_minimal() +
labs(title = "Top 20 Countries by Number of Observations",
x = "Country",
y = "Count")Countries are fairly evenly represented. Bahrain has the most observations (30), while several countries have around 20-29 observations.
remit_train %>%
filter(year == 2024) %>%
slice_max(remittances, n = 15) %>%
ggplot(aes(
x = fct_reorder(country_name, remittances),
y = remittances / 1e9
)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(
title = "Top 15 Remittance Receivers (2024)",
x = "Country",
y = "Remittances (Billions USD)"
) +
theme_minimal()# Top 15 by remittances/GDP (2024)
remit_train %>%
filter(year == 2024) %>%
slice_max(remittances_gdp, n = 15) %>% # preferred over top_n()
mutate(`Country Name` = fct_reorder(country_name, remittances_gdp)) %>%
ggplot(aes(x = `Country Name`, y = remittances_gdp)) +
geom_col(fill = "coral") +
coord_flip() +
labs(
title = "Top 15 Countries: Remittances as % GDP (2024)",
x = "Country",
y = "Remittances (% GDP)"
) +
theme_minimal()remit_train |>
group_by(country_name) |>
summarize(mean_ratio = mean(remittances_gdp, na.rm = TRUE)) |>
arrange(desc(mean_ratio)) |>
slice_head(n = 10)# A tibble: 10 × 2
country_name mean_ratio
<chr> <dbl>
1 Lesotho 39.0
2 Tonga 31.2
3 Bermuda 21.0
4 Nepal 20.3
5 Lebanon 20.0
6 Samoa 19.2
7 El Salvador 18.2
8 Kosovo 16.8
9 Jordan 16.0
10 Jamaica 14.6
top10 <- remit_train |>
group_by(country_name) |>
summarize(mean_ratio = mean(remittances_gdp, na.rm = TRUE)) |>
arrange(desc(mean_ratio)) |>
slice_head(n = 10)
ggplot(top10, aes(x = reorder(country_name, mean_ratio), y = mean_ratio)) +
geom_col(fill = "coral") +
coord_flip() +
labs(
title = "Top 10 Countries by Average Remittances % of GDP",
x = "Country",
y = "Average Remittances/GDP"
) +
theme_minimal()remit_train |>
filter(country_name %in% c(
"Nicaragua", "El Salvador", "Honduras", "Guatemala", "Haiti", "India"
)) |>
ggplot(aes(x = year, y = remittances_gdp, color = country_name)) +
geom_line(linewidth = 1) +
scale_y_log10() +
labs(
title = "Remittance Trends Over Time",
subtitle = "Selected Countries (log scale)",
x = "Year",
y = "Remittances (% of GDP, log scale)",
color = "Country"
) +
theme_minimal()

# Start assessing countries of interest
## Countries whose remittance dynamics we want to inspect more closely
countries_of_interest <- c(
  "Nicaragua", "El Salvador", "Honduras", "Guatemala", "Haiti", "India"
)

## Subset the training data to just those countries
filtered <- filter(remit_train, country_name %in% countries_of_interest)
ggplot(filtered, aes(log(stock), log(remittances), color = country_name)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE) +
labs(
x = "log(stock)",
y = "log(remittances)",
title = "Stock–Remittance Relationship for Selected Countries",
color = "Country"
) +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 11 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 11 rows containing missing values or values outside the scale range
(`geom_point()`).
# Comment: comparing the stock–remittance relationship for the selected countries

## Scatter plot with trend line
remit_train |>
ggplot(aes(x = gdp, y = remittances)) +
geom_point(alpha = 0.3, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red") +
theme_minimal() +
labs(title = "Relationship Between GDP and Remittances (Original Scale)",
x = "GDP (USD)",
y = "Remittances (USD)")`geom_smooth()` using formula = 'y ~ x'
There is a clear positive relationship. Countries with larger economies (higher GDP) tend to receive more remittances in absolute dollar amounts. The red line shows the linear trend.
## Scatter plot with log-transformed variables
remit_train |>
filter(!is.na(log_gdp), !is.na(log_remittances)) |>
ggplot(aes(x = log_gdp, y = log_remittances)) +
geom_point(alpha = 0.3, color = "darkgreen") +
geom_smooth(method = "lm", se = TRUE, color = "red") +
theme_minimal() +
labs(title = "Log GDP vs Log Remittances (Log Scale - Better Linear Fit)",
subtitle = "This relationship is more appropriate for linear regression models",
x = "Log(GDP + 1)",
y = "Log(Remittances + 1)")`geom_smooth()` using formula = 'y ~ x'
The log-transformed relationship is more linear and will produce better model predictions.
## Scatter plot with trend line
remit_train |>
ggplot(aes(x = gdp_per, y = remittances_gdp)) +
geom_point(alpha = 0.3, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red") +
theme_minimal() +
labs(title = "GDP Per Capita vs Remittances as % of GDP",
x = "GDP Per Capita (USD)",
y = "Remittances as % of GDP")`geom_smooth()` using formula = 'y ~ x'
Poorer countries (lower GDP per capita) depend more heavily on remittances as a percentage of their economy. Richer countries receive remittances but they represent a smaller share of their total GDP.
## Scatter plot with trend line
remit_train |>
ggplot(aes(x = unemployment, y = remittances)) +
geom_point(alpha = 0.3, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red") +
theme_minimal() +
labs(title = "Unemployment vs Remittances",
x = "Unemployment Rate (%)",
y = "Remittances (USD)")`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 178 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 178 rows containing missing values or values outside the scale range
(`geom_point()`).
There is a slight negative relationship (it is weak). Unemployment doesn’t appear to be a strong predictor of remittances.
## Calculate correlations between all numeric variables
remit_train |>
select(where(is.numeric)) |>
cor(use = "complete.obs") |>
round(2) year remittances remittances_gdp gdp stock unemployment
year 1.00 0.18 0.03 0.12 0.00 -0.09
remittances 0.18 1.00 0.02 0.48 0.49 -0.16
remittances_gdp 0.03 0.02 1.00 -0.22 0.04 0.03
gdp 0.12 0.48 -0.22 1.00 0.19 -0.11
stock 0.00 0.49 0.04 0.19 1.00 -0.14
unemployment -0.09 -0.16 0.03 -0.11 -0.14 1.00
gdp_per 0.19 0.00 -0.40 0.18 -0.09 -0.09
inflation -0.09 -0.06 0.04 -0.13 0.00 -0.01
vulnerable_emp -0.05 0.05 0.39 -0.09 0.06 -0.12
maternal_mortality -0.05 0.11 0.12 -0.10 0.00 -0.18
exchange_rate 0.04 0.03 -0.06 -0.04 -0.05 -0.07
deportations -0.06 0.30 0.22 0.01 0.77 -0.13
internet 0.66 0.06 -0.32 0.19 -0.08 0.01
poverty -0.24 0.02 0.19 -0.11 0.04 -0.10
dist_pop 0.05 0.09 -0.08 0.14 -0.16 -0.08
dist_cap 0.05 0.09 -0.09 0.14 -0.18 -0.06
terror -0.08 0.24 0.26 0.05 0.23 -0.10
gdp_lag 0.20 0.00 -0.39 0.18 -0.09 -0.08
deportations_lag -0.06 0.31 0.22 0.01 0.80 -0.13
log_remittances 0.23 0.73 0.12 0.39 0.32 -0.14
log_gdp 0.14 0.46 -0.59 0.62 0.20 -0.09
gdp_per inflation vulnerable_emp maternal_mortality
year 0.19 -0.09 -0.05 -0.05
remittances 0.00 -0.06 0.05 0.11
remittances_gdp -0.40 0.04 0.39 0.12
gdp 0.18 -0.13 -0.09 -0.10
stock -0.09 0.00 0.06 0.00
unemployment -0.09 -0.01 -0.12 -0.18
gdp_per 1.00 -0.32 -0.63 -0.34
inflation -0.32 1.00 0.13 0.16
vulnerable_emp -0.63 0.13 1.00 0.61
maternal_mortality -0.34 0.16 0.61 1.00
exchange_rate -0.19 0.08 0.30 0.25
deportations -0.15 0.01 0.10 0.01
internet 0.70 -0.26 -0.62 -0.41
poverty -0.44 0.18 0.70 0.70
dist_pop -0.20 0.08 0.27 0.20
dist_cap -0.16 0.06 0.24 0.19
terror -0.59 0.30 0.58 0.40
gdp_lag 0.99 -0.32 -0.63 -0.33
deportations_lag -0.14 0.01 0.09 0.01
log_remittances 0.07 -0.12 0.01 0.03
log_gdp 0.50 -0.14 -0.38 -0.16
exchange_rate deportations internet poverty dist_pop
year 0.04 -0.06 0.66 -0.24 0.05
remittances 0.03 0.30 0.06 0.02 0.09
remittances_gdp -0.06 0.22 -0.32 0.19 -0.08
gdp -0.04 0.01 0.19 -0.11 0.14
stock -0.05 0.77 -0.08 0.04 -0.16
unemployment -0.07 -0.13 0.01 -0.10 -0.08
gdp_per -0.19 -0.15 0.70 -0.44 -0.20
inflation 0.08 0.01 -0.26 0.18 0.08
vulnerable_emp 0.30 0.10 -0.62 0.70 0.27
maternal_mortality 0.25 0.01 -0.41 0.70 0.20
exchange_rate 1.00 -0.04 -0.18 0.32 0.37
deportations -0.04 1.00 -0.19 0.11 -0.23
internet -0.18 -0.19 1.00 -0.60 -0.14
poverty 0.32 0.11 -0.60 1.00 0.24
dist_pop 0.37 -0.23 -0.14 0.24 1.00
dist_cap 0.36 -0.25 -0.11 0.22 1.00
terror 0.21 0.23 -0.53 0.48 0.28
gdp_lag -0.19 -0.15 0.71 -0.44 -0.19
deportations_lag -0.04 0.95 -0.18 0.10 -0.23
log_remittances 0.06 0.23 0.16 -0.02 0.04
log_gdp 0.02 0.01 0.43 -0.24 0.09
dist_cap terror gdp_lag deportations_lag log_remittances
year 0.05 -0.08 0.20 -0.06 0.23
remittances 0.09 0.24 0.00 0.31 0.73
remittances_gdp -0.09 0.26 -0.39 0.22 0.12
gdp 0.14 0.05 0.18 0.01 0.39
stock -0.18 0.23 -0.09 0.80 0.32
unemployment -0.06 -0.10 -0.08 -0.13 -0.14
gdp_per -0.16 -0.59 0.99 -0.14 0.07
inflation 0.06 0.30 -0.32 0.01 -0.12
vulnerable_emp 0.24 0.58 -0.63 0.09 0.01
maternal_mortality 0.19 0.40 -0.33 0.01 0.03
exchange_rate 0.36 0.21 -0.19 -0.04 0.06
deportations -0.25 0.23 -0.15 0.95 0.23
internet -0.11 -0.53 0.71 -0.18 0.16
poverty 0.22 0.48 -0.44 0.10 -0.02
dist_pop 1.00 0.28 -0.19 -0.23 0.04
dist_cap 1.00 0.25 -0.15 -0.25 0.05
terror 0.25 1.00 -0.59 0.24 0.24
gdp_lag -0.15 -0.59 1.00 -0.14 0.08
deportations_lag -0.25 0.24 -0.14 1.00 0.23
log_remittances 0.05 0.24 0.08 0.23 1.00
log_gdp 0.11 -0.08 0.50 0.02 0.54
log_gdp
year 0.14
remittances 0.46
remittances_gdp -0.59
gdp 0.62
stock 0.20
unemployment -0.09
gdp_per 0.50
inflation -0.14
vulnerable_emp -0.38
maternal_mortality -0.16
exchange_rate 0.02
deportations 0.01
internet 0.43
poverty -0.24
dist_pop 0.09
dist_cap 0.11
terror -0.08
gdp_lag 0.50
deportations_lag 0.02
log_remittances 0.54
log_gdp 1.00
## Create visual correlation matrix (Eva's example)
remit_train |>
select(where(is.numeric)) |>
cor(use = "complete.obs") |>
corrplot(method = "color",
type = "upper",
tl.col = "black",
tl.srt = 45,
title = "Correlation Matrix",
mar = c(0,0,2,0))## Create visual correlation matrix (Gaby's example)
library(reshape2)
Attaching package: 'reshape2'
The following object is masked from 'package:tidyr':
smiths
# Create variable to represent numeric vars
# Select the key predictors, log1p-transform the heavily skewed ones
# (stock = migrant stock), and keep only complete rows.
numeric_vars_log <- remit_train |>
  select(remittances_gdp, remittances, stock, unemployment, gdp_per,
         inflation, internet, dist_cap, terror) |>
  mutate(across(
    c(remittances_gdp, remittances, stock, gdp_per, internet, dist_cap),
    log1p
  )) |>
  na.omit()
# Correlate, reshape to long form, and draw an annotated heatmap.
cor_matrix_log <- cor(numeric_vars_log, use = "complete.obs")
melted_cor_log <- melt(cor_matrix_log)
ggplot(melted_cor_log, aes(Var1, Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(label = round(value, 2)), size = 3) +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0, limits = c(-1, 1)
  ) +
  labs(title = "Correlation Heatmap (Log-Transformed Variables)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))## Calculate average remittances per year and plot
## Yearly mean of raw remittance inflows (USD), plotted as a time series.
remit_train |>
group_by(year) |>
summarise(avg_remittances = mean(remittances, na.rm = TRUE)) |>
ggplot(aes(x = year, y = avg_remittances)) +
geom_line(color = "steelblue", linewidth = 1) +
geom_point(color = "steelblue") +
theme_minimal() +
labs(title = "Average Remittances Over Time (1994-2024)",
x = "Year",
y = "Average Remittances (USD)")Remittances show a clear upward trend over 30 years. We can see:
## Calculate average remittances as % of GDP per year and plot
## Normalizing by GDP shows remittance *intensity* rather than volume.
remit_train |>
group_by(year) |>
summarise(avg_remittances_gdp = mean(remittances_gdp, na.rm = TRUE)) |>
ggplot(aes(x = year, y = avg_remittances_gdp)) +
geom_line(color = "steelblue", linewidth = 1) +
geom_point(color = "steelblue") +
theme_minimal() +
labs(title = "Average Remittances as % of GDP Over Time",
x = "Year",
y = "Remittances as % of GDP")Remittances as a percentage of GDP have stayed relatively stable around 3-4% over time. This means remittances are growing roughly in line with GDP growth, not becoming more or less important to economies over time.
Past changes to country-of-origin conditions are likely to be more illustrative of future remittances than current conditions. For example, while a downward shock in GDP may influence current migration, migrants may take time to settle into the US before they begin remitting back home.
These lagged effects would seem most plausible for GDP, unemployment, terror, deportations, and changes in migrant stock (inward migration).
## Lagged (1 year)
## Within each country's time series (ordered by year), shift the
## candidate predictors back one observation so every row also carries
## the prior year's value.
remit_lag <- remit_train |>
  mutate(year = as.numeric(year)) |>
  arrange(country_name, year) |>
  group_by(country_name) |>
  mutate(
    gdp_lag          = lag(gdp_per),
    unemp_lag        = lag(unemployment),
    terror_lag       = lag(terror),
    deportations_lag = lag(deportations),
    stock_lag        = lag(stock)
  ) |>
  ungroup()
# Verifying that the lag worked.
# Each row's gdp_lag should equal the previous year's gdp_per for the
# same country; inspect the first 10 non-missing lagged rows.
remit_lag |>
select(country_name, year, gdp_per, gdp_lag) |>
arrange(country_name, year) |>
filter(!is.na(gdp_lag)) |>
slice_head(n = 10)# A tibble: 10 × 4
country_name year gdp_per gdp_lag
<chr> <dbl> <dbl> <dbl>
1 Afghanistan 2009 452. 382.
2 Afghanistan 2010 561. 452.
3 Afghanistan 2011 607. 561.
4 Afghanistan 2012 651. 607.
5 Afghanistan 2013 637. 651.
6 Afghanistan 2015 566. 637.
7 Afghanistan 2016 522. 566.
8 Afghanistan 2017 525. 522.
9 Afghanistan 2018 491. 525.
10 Afghanistan 2019 497. 491.
## Lagged predictors relationship with remittances (as % of GDP)
## GDP per capita
# Lagged vs Unlagged
# Stack current and lagged GDP per capita into one long column so both
# appear on the same scatter, with a smooth per series.
remit_lag |>
pivot_longer(cols = c(gdp_per, gdp_lag),
names_to = "type",
values_to = "value") |>
ggplot(aes(value, remittances_gdp, color = type)) +
geom_point(alpha = 0.3) +
geom_smooth(se = FALSE) +
theme_minimal() +
labs(title = "Lagged vs Current GDP per Capita",
x = "GDP per capita",
color = "Variable") ## Doesn't Necessarily Improve Model Fit. Overall there seem to be better ways to verify whether lagged variable would improve the interpretability of our models.
## Comparing Lagged vs Current GDP per capita for key countries.
# Per-country linear fits; linetype distinguishes current vs lagged GDP.
remit_lag |>
filter(country_name %in% c("Nicaragua", "El Salvador", "Honduras",
"Guatemala", "Haiti", "India")) |>
pivot_longer(
cols = c(gdp_per, gdp_lag),
names_to = "gdp_type",
values_to = "gdp_value"
) |>
ggplot(aes(x = gdp_value, y = remittances_gdp,
color = country_name, linetype = gdp_type)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal() +
labs(
title = "Lagged vs Current GDP per Capita",
x = "GDP per capita (current or lagged)",
linetype = "GDP variable"
)The suggestion is that lagged GDP demonstrates a slightly stronger relationship and thus may improve model fit; shocks or changes to prior GDP could help explain current remittance amounts.
## Comparing Lagged vs Current Unemployment for key countries.
# Reshape to long form so current and lagged unemployment share one
# plot; per-country linear fits, linetype distinguishes the two series.
remit_lag |>
  filter(country_name %in% c("Nicaragua", "El Salvador", "Honduras",
                             "Guatemala", "Haiti", "India")) |>
  pivot_longer(
    cols = c(unemployment, unemp_lag),
    names_to = "unemp_type",
    values_to = "unemp_value"
  ) |>
  ggplot(aes(x = unemp_value, y = remittances_gdp,
             color = country_name, linetype = unemp_type)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal() +
  labs(
    title = "Lagged vs Current Unemployment",
    x = "Unemployment (current or lagged)",
    # Fixed copy-paste bug: the legend previously said "GDP variable".
    linetype = "Unemployment variable"
)Lagged unemployment does not seem to alter model fit substantially for any country other than Haiti.
It likely won’t improve our model fit and thus shouldn’t be included.
## Comparing Lagged vs Current Terror for key countries.
# Same long-form comparison as above, for the terror index.
remit_lag |>
filter(country_name %in% c("Nicaragua", "El Salvador", "Honduras",
"Guatemala", "Haiti", "India")) |>
pivot_longer(
cols = c(terror, terror_lag),
names_to = "terror_type",
values_to = "terror_value"
) |>
ggplot(aes(x = terror_value, y = remittances_gdp,
color = country_name, linetype = terror_type)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal() +
labs(
title = "Lagged vs Current Terror",
x = "Terror (current or lagged)",
linetype = "Terror variable"
)Terror appears to vary less over time, so lagged terror levels may not add much explanatory power.
## Comparing Lagged vs Current Deportations for key countries.
# Same long-form comparison as above, for deportation counts.
remit_lag |>
filter(country_name %in% c("Nicaragua", "El Salvador", "Honduras",
"Guatemala", "Haiti", "India")) |>
pivot_longer(
cols = c(deportations, deportations_lag),
names_to = "deportations_type",
values_to = "deportations_value"
) |>
ggplot(aes(x = deportations_value, y = remittances_gdp,
color = country_name, linetype = deportations_type)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal() +
labs(
title = "Lagged vs Current Deportations",
x = "Deportations (current or lagged)",
linetype = "Deportations variable"
)The relationship is much stronger for key countries when the lagged effects of deportations are included to explain future remittances.
## Comparing Lagged vs Current Migrant Stock for key countries.
## (Header corrected: this chunk plots migrant stock, not deportations.)
remit_lag |>
filter(country_name %in% c("Nicaragua", "El Salvador", "Honduras",
"Guatemala", "Haiti", "India")) |>
pivot_longer(
cols = c(stock, stock_lag),
names_to = "stock_type",
values_to = "stock_value"
) |>
ggplot(aes(x = stock_value, y = remittances_gdp,
color = country_name, linetype = stock_type)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal() +
labs(
title = "Lagged vs Current Changes in Migrant Stock ",
x = "Migrant Stock (current or lagged)",
linetype = "Migrant Stock variable"
)The change is less pronounced and thus probably doesn’t warrant inclusion.
Takeaways:
The predictive power of lagged deportations and lagged GDP improves model fit the most (these shift the slopes of our relationships the furthest).
For the other predictors — changes to migrant stock, terror, and unemployment — the relationships for our key countries barely changed, indicating no gain in explanatory power.
Next Steps: using step_mutate in our recipe to add lags to our gdp_per and deportations would account for this.
## To improve the accuracy of our estimated error rates, we set up a 10-fold cross validation with 5 repetitions since we have a relatively small number of observations within the training data
remit_folds <- vfold_cv(data = remit_train, v = 10, repeats = 5)## Recipe (baseline model)
# Baseline recipe: median-impute numeric predictors, log-transform GDP
# per capita (log(x + 1) guards against zeros), then center and scale.
# NOTE(review): every predictor in this formula is numeric, so
# step_impute_mode(all_nominal_predictors()) selects no columns here.
recipe_baseline <-
recipe(remittances_gdp ~ stock + gdp_per + gdp_lag + unemployment +
dist_cap + terror + deportations + deportations_lag +
internet + inflation,
data = remit_train) |>
step_impute_median(all_numeric_predictors()) |>
step_impute_mode(all_nominal_predictors()) |>
step_mutate(gdp_per = log(gdp_per + 1)) |>
step_normalize(all_numeric_predictors())
## Processing the full training data using parameter specification.
bake(prep(recipe_baseline, training = remit_train), new_data = remit_train)# A tibble: 3,292 × 11
stock gdp_per gdp_lag unemployment dist_cap terror deportations
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -0.163 1.00 -0.414 1.59 -0.473 -1.20 -0.155
2 -0.163 -1.31 -0.615 -0.517 1.13 -0.288 -0.161
3 -0.0870 -0.0360 -0.409 0.756 0.240 0.627 -0.145
4 -0.163 1.67 2.28 -0.471 -2.18 -0.288 -0.142
5 0.145 0.0499 -0.443 0.373 -0.226 0.627 -0.150
6 -0.163 -0.177 -0.475 0.388 -0.465 -0.288 -0.155
7 -0.163 0.112 -0.378 0.652 -0.112 -0.288 -0.155
8 -0.0916 -0.559 -0.537 -0.968 1.51 -0.288 -0.142
9 -0.163 1.37 1.22 1.14 1.41 -0.288 -0.161
10 -0.241 0.346 -0.250 -0.488 0.283 0.627 -0.158
# ℹ 3,282 more rows
# ℹ 4 more variables: deportations_lag <dbl>, internet <dbl>, inflation <dbl>,
# remittances_gdp <dbl>
In this section, we compare five different regression models to predict remittances as a percentage of GDP.
## Linear model
# Plain OLS (no regularization) — the benchmark all later models are
# compared against.
lm_baseline <- linear_reg() |>
set_mode(mode = "regression") |>
set_engine(engine = "lm")
## Create workflow
lm_workflow <- workflow() |>
add_recipe(recipe_baseline) |>
add_model(lm_baseline)
## Fit model
# Evaluate across the repeated 10-fold resamples defined above.
lm_results <- lm_workflow |>
fit_resamples(resamples = remit_folds)
## Collect RMSE
collect_metrics(lm_results)# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 rmse standard 6.47 50 0.153 Preprocessor1_Model1
2 rsq standard 0.0762 50 0.00346 Preprocessor1_Model1
Establishes a baseline performance using ordinary least squares regression with no regularization. This gives us a benchmark to compare other models against.
library(tidymodels)
library(glmnet)Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
Loaded glmnet 4.1-10
library(dplyr)
#We clean remittances before folding due to missing values
# NOTE(review): this redefines remit_folds as plain 10-fold CV (no
# repeats), so the glmnet models below are resampled differently from
# the OLS baseline above — comparisons between them are approximate.
train_data2 <- remit_train %>%
filter(!is.na(remittances_gdp))
remit_folds <- vfold_cv(train_data2, v = 10)
# At first, glmnet was throwing errors, so we need a recipe that forces
# the predictors into a form ridge/lasso (glmnet) can handle: no NA,
# no Inf / -Inf, no constant columns, comparable scales across predictors.
# Built from a fresh recipe rather than layering on recipe_baseline:
# layering duplicated the impute/normalize steps (they ran twice) and
# converted Inf to NA only *after* the first imputation pass. Here each
# step runs exactly once, in order: finite-check -> impute -> log
# transform -> drop zero-variance columns -> normalize.
recipe_glmnet <- recipe(remittances_gdp ~ stock + gdp_per + gdp_lag +
                          unemployment + dist_cap + terror + deportations +
                          deportations_lag + internet + inflation,
                        data = remit_train) %>%
  step_mutate(across(all_numeric_predictors(),
                     ~ if_else(is.finite(.x), .x, NA_real_))) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%
  step_mutate(gdp_per = log(gdp_per + 1)) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors())
# Save fold-level predictions and print progress while tuning.
ctrl <- control_grid(save_pred = TRUE, verbose = TRUE)
# 30 candidate penalties across dials' default penalty() range.
grid30 <- grid_regular(penalty(), levels = 30)
metrics1 <- metric_set(rmse)
# This control object makes errors visible and traceable.
# This defines ridge regression (mixture = 0, pure L2 penalty) with a
# tuned penalty.
ridge_spec <- linear_reg(penalty = tune(), mixture = 0) %>%
set_mode("regression") %>%
set_engine("glmnet")
# build the workflow (recipe + model)
ridge_wf <- workflow() %>%
add_recipe(recipe_glmnet) %>%
add_model(ridge_spec)
#We tune the penalty using cross-validation
ridge_res <- tune_grid(
ridge_wf,
resamples = remit_folds,
grid = grid30,
metrics = metrics1,
control = ctrl
)
# Pick the penalty with the lowest cross-validated RMSE, lock it into
# the workflow, then refit on the full NA-outcome-free training data.
best_ridge <- select_best(ridge_res, metric = "rmse")
final_ridge_wf <- finalize_workflow(ridge_wf, best_ridge)
ridge_fit <- fit(final_ridge_wf, data = train_data2)
# results
best_ridge# A tibble: 1 × 2
penalty .config
<dbl> <chr>
1 0.204 Preprocessor1_Model28
final_ridge_wf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
9 Recipe Steps
• step_impute_median()
• step_impute_mode()
• step_mutate()
• step_normalize()
• step_mutate()
• step_impute_median()
• step_impute_mode()
• step_zv()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)
Main Arguments:
penalty = 0.204335971785695
mixture = 0
Computational engine: glmnet
ridge_fit ══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
9 Recipe Steps
• step_impute_median()
• step_impute_mode()
• step_mutate()
• step_normalize()
• step_mutate()
• step_impute_median()
• step_impute_mode()
• step_zv()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "gaussian", alpha = ~0)
Df %Dev Lambda
1 10 0.00 1217.00
2 10 0.11 1109.00
3 10 0.12 1010.00
4 10 0.13 920.50
5 10 0.14 838.70
6 10 0.15 764.20
7 10 0.17 696.30
8 10 0.19 634.40
9 10 0.20 578.10
10 10 0.22 526.70
11 10 0.24 479.90
12 10 0.26 437.30
13 10 0.29 398.40
14 10 0.31 363.00
15 10 0.34 330.80
16 10 0.37 301.40
17 10 0.41 274.60
18 10 0.44 250.20
19 10 0.48 228.00
20 10 0.53 207.70
21 10 0.57 189.30
22 10 0.62 172.50
23 10 0.67 157.20
24 10 0.73 143.20
25 10 0.79 130.50
26 10 0.86 118.90
27 10 0.93 108.30
28 10 1.00 98.70
29 10 1.08 89.93
30 10 1.16 81.94
31 10 1.24 74.66
32 10 1.34 68.03
33 10 1.43 61.98
34 10 1.53 56.48
35 10 1.64 51.46
36 10 1.74 46.89
37 10 1.86 42.72
38 10 1.97 38.93
39 10 2.09 35.47
40 10 2.22 32.32
41 10 2.35 29.45
42 10 2.48 26.83
43 10 2.61 24.45
44 10 2.74 22.28
45 10 2.88 20.30
46 10 3.02 18.49
...
and 54 more lines.
The ridge regularization path showed explained deviance increasing as the penalty decreased, with cross-validation selecting a very small penalty (0.204) at the low end of the path. This suggests the essentially unpenalized linear model already provides close to the best fit ridge can achieve for these data.
# we specify the LASSO model
# mixture = 1 gives the pure L1 penalty, which can shrink coefficients
# exactly to zero and so doubles as variable selection.
lasso_spec <- linear_reg(penalty = tune(), mixture = 1) %>%
set_mode("regression") %>%
set_engine("glmnet")
# we build the workflow - preprocess to prevent leakage
lasso_wf <- workflow() %>%
add_recipe(recipe_glmnet) %>%
add_model(lasso_spec)
# Tune lambda penalty using cross-validation
lasso_res <- tune_grid(
lasso_wf,
resamples = remit_folds,
grid = grid30,
metrics = metrics1,
control = ctrl
)
# choose the best lambda (lowest RMSE)
best_lasso <- select_best(lasso_res, metric = "rmse")
# finalize the workflow
final_lasso_wf <- finalize_workflow(lasso_wf, best_lasso)
# fit the final LASSO model on all training data
lasso_fit <- fit(final_lasso_wf, data = train_data2)
best_lasso# A tibble: 1 × 2
penalty .config
<dbl> <chr>
1 0.0418 Preprocessor1_Model26
final_lasso_wf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
9 Recipe Steps
• step_impute_median()
• step_impute_mode()
• step_mutate()
• step_normalize()
• step_mutate()
• step_impute_median()
• step_impute_mode()
• step_zv()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)
Main Arguments:
penalty = 0.0417531893656041
mixture = 1
Computational engine: glmnet
lasso_fit══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
9 Recipe Steps
• step_impute_median()
• step_impute_mode()
• step_mutate()
• step_normalize()
• step_mutate()
• step_impute_median()
• step_impute_mode()
• step_zv()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "gaussian", alpha = ~1)
Df %Dev Lambda
1 0 0.00 1.21700
2 1 0.54 1.10900
3 1 0.99 1.01000
4 1 1.37 0.92050
5 1 1.68 0.83870
6 1 1.94 0.76420
7 2 2.36 0.69630
8 2 2.72 0.63440
9 2 3.02 0.57810
10 2 3.26 0.52670
11 3 3.49 0.47990
12 3 3.77 0.43730
13 3 4.00 0.39840
14 4 4.20 0.36300
15 5 4.42 0.33080
16 5 4.61 0.30140
17 6 4.89 0.27460
18 6 5.19 0.25020
19 7 5.56 0.22800
20 7 5.90 0.20770
21 7 6.18 0.18930
22 7 6.41 0.17250
23 7 6.61 0.15720
24 8 6.79 0.14320
25 9 6.95 0.13050
26 9 7.10 0.11890
27 9 7.21 0.10830
28 9 7.31 0.09870
29 9 7.39 0.08993
30 9 7.46 0.08194
31 9 7.51 0.07466
32 9 7.56 0.06803
33 9 7.60 0.06198
34 9 7.63 0.05648
35 9 7.66 0.05146
36 9 7.68 0.04689
37 9 7.70 0.04272
38 9 7.71 0.03893
39 9 7.72 0.03547
40 10 7.74 0.03232
41 10 7.75 0.02945
42 10 7.76 0.02683
43 10 7.76 0.02445
44 10 7.77 0.02228
45 10 7.78 0.02030
46 10 7.78 0.01849
...
and 24 more lines.
# Surviving LASSO coefficients: drop the intercept and zeroed-out
# terms, then rank by absolute magnitude.
tidy(lasso_fit) |>
  filter(term != "(Intercept)", estimate != 0) |>
  arrange(desc(abs(estimate)))# A tibble: 9 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 gdp_per -2.03 0.0418
2 stock -0.859 0.0418
3 internet 0.794 0.0418
4 deportations_lag 0.738 0.0418
5 deportations 0.660 0.0418
6 unemployment 0.556 0.0418
7 dist_cap -0.386 0.0418
8 terror -0.257 0.0418
9 inflation -0.102 0.0418
# RMSE comparison
# Keep only each model's metrics row at its selected penalty (the
# inner_join on "penalty" filters to the winning value), then stack.
bind_rows(
collect_metrics(ridge_res) |>
filter(.metric == "rmse") |>
inner_join(best_ridge, by = "penalty") |>
mutate(model = "ridge"),
collect_metrics(lasso_res) |>
filter(.metric == "rmse") |>
inner_join(best_lasso, by = "penalty") |>
mutate(model = "lasso")
) |>
select(model, penalty, mean, std_err)# A tibble: 2 × 4
model penalty mean std_err
<chr> <dbl> <dbl> <dbl>
1 ridge 0.204 6.46 0.370
2 lasso 0.0418 6.46 0.369
The LASSO model selected a set of nine predictors. Remittance intensity is negatively associated with GDP per capita and macroeconomic instability, while unemployment, deportations, and internet access exhibit positive relationships, consistent with counter-cyclical and transaction-cost mechanisms.
We use this model to reduce variance error by down-weighting less important predictors. Random forest is a bagging algorithm that, at each split, considers only a random subset of predictors. It uses two hyperparameters: mtry, the number of predictors considered at each split (tunable to the optimal value), and min_n, the minimum node size that stops further splitting of the data.
library(vip)
# For missingness inside the dependent variable
remit_train_clean <- remit_train |>
filter(!is.na(remittances_gdp))
# Smaller CV for run time optimization
# 5 folds (vs repeated 10-fold earlier) keeps random-forest tuning fast.
rf_folds <- vfold_cv(remit_train_clean, v = 5)
## Given the high degree of missingness a recipe that accounts for nas will avoid it from breaking down using median imputation.
# country_name is kept as an identifier (role "id"), not a predictor,
# so ranger never sees it as a feature.
recipe_alt <-
recipe(remittances_gdp ~ stock + gdp_per + gdp_lag + unemployment +
dist_cap + terror + deportations + deportations_lag +
internet + inflation + country_name,
data = remit_train_clean) |>
update_role(country_name, new_role = "id") |>
step_impute_median(all_numeric_predictors()) |>
step_mutate(gdp_per = log(gdp_per + 1)) |>
step_normalize(all_numeric_predictors())
bake(prep(recipe_alt, training = remit_train_clean), new_data = remit_train_clean)# A tibble: 3,292 × 12
stock gdp_per gdp_lag unemployment dist_cap terror deportations
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -0.163 1.00 -0.414 1.59 -0.473 -1.20 -0.155
2 -0.163 -1.31 -0.615 -0.517 1.13 -0.288 -0.161
3 -0.0870 -0.0360 -0.409 0.756 0.240 0.627 -0.145
4 -0.163 1.67 2.28 -0.471 -2.18 -0.288 -0.142
5 0.145 0.0499 -0.443 0.373 -0.226 0.627 -0.150
6 -0.163 -0.177 -0.475 0.388 -0.465 -0.288 -0.155
7 -0.163 0.112 -0.378 0.652 -0.112 -0.288 -0.155
8 -0.0916 -0.559 -0.537 -0.968 1.51 -0.288 -0.142
9 -0.163 1.37 1.22 1.14 1.41 -0.288 -0.161
10 -0.241 0.346 -0.250 -0.488 0.283 0.627 -0.158
# ℹ 3,282 more rows
# ℹ 5 more variables: deportations_lag <dbl>, internet <dbl>, inflation <dbl>,
# country_name <chr>, remittances_gdp <dbl>
## Creating a Random Forest Model set up for tuning
# trees, mtry, and min_n are all tune() placeholders; impurity-based
# importance is requested so vip() can rank predictors later.
rf_mod <- rand_forest(
trees = tune(),
mtry = tune(),
min_n = tune()) |>
set_mode(mode = "regression") |>
set_engine(engine = "ranger",
importance = "impurity",
num.threads = 4)
## Creating a workflow. Need to use the alternative specification (recipe_alt) because it accounts for missingness, which would otherwise cause the model to fail.
rf_wf <- workflow() |>
add_recipe(recipe_alt) |>
add_model(rf_mod)
## Finalize parameters
# mtry's upper bound depends on the data, so finalize() fills it in
# from remit_train_clean.
rf_params <- rf_wf |>
extract_parameter_set_dials() |>
finalize(remit_train_clean)
rf_params
## tuning grid
# NOTE(review): grid_max_entropy() is deprecated in recent dials
# releases in favor of grid_space_filling() — confirm package version.
rf_grid <- grid_max_entropy(
rf_params,
size = 20 )
## Tuning it within cross validation using our hyperparameters.
rf_tuned <- rf_wf |>
tune_grid(
resamples = rf_folds,
grid = rf_grid,
control = control_grid(save_pred = TRUE))
## Measuring the RMSE
rf_tuned |>
collect_metrics()# A tibble: 40 × 9
mtry trees min_n .metric .estimator mean n std_err .config
<int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 20 1868 19 rmse standard 3.77 5 0.537 Preprocessor1_Model…
2 20 1868 19 rsq standard 0.718 5 0.0420 Preprocessor1_Model…
3 17 1054 3 rmse standard 3.41 5 0.545 Preprocessor1_Model…
4 17 1054 3 rsq standard 0.772 5 0.0437 Preprocessor1_Model…
5 9 1913 23 rmse standard 3.87 5 0.540 Preprocessor1_Model…
6 9 1913 23 rsq standard 0.703 5 0.0406 Preprocessor1_Model…
7 15 1724 3 rmse standard 3.41 5 0.540 Preprocessor1_Model…
8 15 1724 3 rsq standard 0.772 5 0.0428 Preprocessor1_Model…
9 16 1797 30 rmse standard 4.04 5 0.527 Preprocessor1_Model…
10 16 1797 30 rsq standard 0.671 5 0.0371 Preprocessor1_Model…
# ℹ 30 more rows
# Top 10 hyperparameter combinations ranked by cross-validated RMSE.
rf_tuned |>
show_best(metric = "rmse", n = 10)# A tibble: 10 × 9
mtry trees min_n .metric .estimator mean n std_err .config
<int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 15 1724 3 rmse standard 3.41 5 0.540 Preprocessor1_Model…
2 17 1054 3 rmse standard 3.41 5 0.545 Preprocessor1_Model…
3 14 319 8 rmse standard 3.50 5 0.525 Preprocessor1_Model…
4 14 1185 14 rmse standard 3.66 5 0.541 Preprocessor1_Model…
5 3 1582 7 rmse standard 3.73 5 0.589 Preprocessor1_Model…
6 20 1868 19 rmse standard 3.77 5 0.537 Preprocessor1_Model…
7 6 12 6 rmse standard 3.82 5 0.461 Preprocessor1_Model…
8 22 1048 22 rmse standard 3.85 5 0.526 Preprocessor1_Model…
9 2 737 4 rmse standard 3.85 5 0.594 Preprocessor1_Model…
10 9 1913 23 rmse standard 3.87 5 0.540 Preprocessor1_Model…
## selecting the best specification and fit it to the full training data.
best_rf <-rf_tuned |>
select_best(metric = "rmse")
final_rf_wf <- rf_wf |>
finalize_workflow(best_rf)
final_rf_fit <- final_rf_wf |>
fit(data = remit_train_clean)
# Variable importance
# Uses ranger's impurity importance (requested in the engine spec).
final_rf_fit |>
extract_fit_parsnip() |>
vip(num_features = 10)KNN predicts remittances by averaging the values of the K most similar country-year observations.
## Create lagged variables by country
## Lag GDP per capita and deportations within each country's time
## series (ordered by year).
remit_train_lagged <- remit_train |>
  arrange(country_name, year) |>
  group_by(country_name) |>
  mutate(
    gdp_lag          = lag(gdp_per),
    deportations_lag = lag(deportations)
  ) |>
  ungroup()
## Remove missing values
## Keep only complete cases of the modeling columns before KNN.
remit_train_clean_knn <- remit_train_lagged |>
  select(
    remittances_gdp, stock, gdp_per, unemployment, dist_cap,
    terror, deportations, internet, inflation, gdp_lag, deportations_lag
  ) |>
  drop_na()
## Cross-validation setup
set.seed(20251211)
knn_folds <- vfold_cv(data = remit_train_clean_knn, v = 10, repeats = 5)
## Recipe: log transform and normalize
# Normalization matters for KNN: distance is scale-sensitive.
recipe_knn <-
recipe(remittances_gdp ~ ., data = remit_train_clean_knn) |>
step_mutate(gdp_per = log(gdp_per + 1)) |>
step_normalize(all_numeric_predictors())
## Model: KNN with tunable K
knn_mod <-
nearest_neighbor(neighbors = tune()) |>
set_engine("kknn") |>
set_mode("regression")
## Grid: test K from 1 to 99
# grid_regular with 10 levels spaces K roughly evenly across [1, 99].
knn_grid <- grid_regular(neighbors(range = c(1, 99)), levels = 10)
## Workflow
knn_workflow <-
workflow() |>
add_recipe(recipe_knn) |>
add_model(knn_mod)
## Tune
knn_results <-
knn_workflow |>
tune_grid(
resamples = knn_folds,
grid = knn_grid,
metrics = metric_set(rmse, rsq)
)
## Results
knn_results |> collect_metrics()# A tibble: 20 × 7
neighbors .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 rmse standard 3.05 50 0.107 Preprocessor1_Model01
2 1 rsq standard 0.747 50 0.0145 Preprocessor1_Model01
3 11 rmse standard 3.07 50 0.0783 Preprocessor1_Model02
4 11 rsq standard 0.730 50 0.0116 Preprocessor1_Model02
5 22 rmse standard 3.35 50 0.0672 Preprocessor1_Model03
6 22 rsq standard 0.687 50 0.0100 Preprocessor1_Model03
7 33 rmse standard 3.58 50 0.0658 Preprocessor1_Model04
8 33 rsq standard 0.649 50 0.00946 Preprocessor1_Model04
9 44 rmse standard 3.77 50 0.0669 Preprocessor1_Model05
10 44 rsq standard 0.614 50 0.00933 Preprocessor1_Model05
11 55 rmse standard 3.91 50 0.0682 Preprocessor1_Model06
12 55 rsq standard 0.584 50 0.00943 Preprocessor1_Model06
13 66 rmse standard 4.03 50 0.0696 Preprocessor1_Model07
14 66 rsq standard 0.560 50 0.00957 Preprocessor1_Model07
15 77 rmse standard 4.12 50 0.0709 Preprocessor1_Model08
16 77 rsq standard 0.540 50 0.00970 Preprocessor1_Model08
17 88 rmse standard 4.20 50 0.0720 Preprocessor1_Model09
18 88 rsq standard 0.523 50 0.00981 Preprocessor1_Model09
19 99 rmse standard 4.26 50 0.0730 Preprocessor1_Model10
20 99 rsq standard 0.509 50 0.00989 Preprocessor1_Model10
knn_results |> show_best(metric = "rmse")# A tibble: 5 × 7
neighbors .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 rmse standard 3.05 50 0.107 Preprocessor1_Model01
2 11 rmse standard 3.07 50 0.0783 Preprocessor1_Model02
3 22 rmse standard 3.35 50 0.0672 Preprocessor1_Model03
4 33 rmse standard 3.58 50 0.0658 Preprocessor1_Model04
5 44 rmse standard 3.77 50 0.0669 Preprocessor1_Model05
# Plot the tuning curves, then lock the best K into the workflow.
knn_results |> autoplot()## Select best
best_k <- select_best(knn_results, metric = "rmse")
final_knn_wf <- knn_workflow |> finalize_workflow(best_k)## Compare all models
# Pull each model's best cross-validated RMSE into one ranked table.
# NOTE(review): the models were resampled on different fold schemes
# (repeated 10-fold, plain 10-fold, 5-fold), so this comparison is
# approximate rather than exact.
tibble(
Model = c("OLS", "Ridge", "LASSO", "Random Forest", "KNN"),
Error = c(
collect_metrics(lm_results) |> filter(.metric == "rmse") |> pull(mean),
collect_metrics(ridge_res) |> filter(.metric == "rmse") |> inner_join(best_ridge, by = "penalty") |> pull(mean),
collect_metrics(lasso_res) |> filter(.metric == "rmse") |> inner_join(best_lasso, by = "penalty") |> pull(mean),
collect_metrics(rf_tuned) |> filter(.metric == "rmse") |> slice_min(mean) |> pull(mean),
collect_metrics(knn_results) |> filter(.metric == "rmse") |> slice_min(mean) |> pull(mean)
)
) |>
arrange(Error)# A tibble: 5 × 2
Model Error
<chr> <dbl>
1 KNN 3.05
2 Random Forest 3.41
3 LASSO 6.46
4 Ridge 6.46
5 OLS 6.47
# Lollipop chart of the same RMSE comparison; models sorted best-first.
tibble(
Model = c("OLS", "Ridge", "LASSO", "Random Forest", "KNN"),
Error = c(
collect_metrics(lm_results) |> filter(.metric == "rmse") |> pull(mean),
collect_metrics(ridge_res) |> filter(.metric == "rmse") |> inner_join(best_ridge, by = "penalty") |> pull(mean),
collect_metrics(lasso_res) |> filter(.metric == "rmse") |> inner_join(best_lasso, by = "penalty") |> pull(mean),
collect_metrics(rf_tuned) |> filter(.metric == "rmse") |> slice_min(mean) |> pull(mean),
collect_metrics(knn_results) |> filter(.metric == "rmse") |> slice_min(mean) |> pull(mean)
)
) |>
ggplot(aes(x = Error, y = reorder(Model, -Error))) +
geom_point(size = 4, color = "steelblue") +
geom_segment(aes(x = 0, xend = Error, y = Model, yend = Model), color = "gray70") +
labs(title = "Model Performance",
subtitle = "Left is better",
x = "Prediction Error",
y = NULL) +
theme_minimal()KNN had the lowest cross-validated error (3.05), with Random Forest close behind (3.41); we carry Random Forest forward as our final model. Now we test it on held-out data.
## Create new split for Random Forest (uses clean data)
# NOTE(review): this re-splits remit_train_clean, the same data the
# forest was tuned on, so the "test" rows were seen during tuning and
# this estimate is likely optimistic. The untouched remit_test split
# would give an honest test error.
set.seed(20251211)
rf_split <- initial_split(remit_train_clean, prop = 0.8)
## Test Random Forest
# last_fit() trains on the split's training portion and evaluates on
# its holdout portion.
test_results <- final_rf_wf |> last_fit(rf_split)
## Show results
test_results |> collect_metrics()# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 2.68 Preprocessor1_Model1
2 rsq standard 0.860 Preprocessor1_Model1
On average, predictions are off by 2.68 percentage points (the holdout RMSE). The model explains 86% of the variation in remittances
## Visualize: Actual vs Predicted
# Points on the dashed 45-degree line are perfect predictions.
test_results |>
collect_predictions() |>
ggplot(aes(x = remittances_gdp, y = .pred)) +
geom_point(alpha = 0.5, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Random Forest: Test Set Performance",
subtitle = "Points near line = good predictions",
x = "Actual Remittances (% GDP)",
y = "Predicted"
) +
theme_minimal()Random Forest performs far better than the linear models on held-out data, and comparably to KNN.
Earlier in the modeling process we explored 6 countries of interest that represent either increasingly important remittance-receiving countries — those who would be most impacted by changes in US migration policy (deportations, overall changes in migrant stock, etc.) — or countries that have been important players throughout the period our model covers.
We want to now apply to these 6 (plus mexico) to see, on average, how well our model responds to the predictors in particular for countries in Latin America (and India) who have widely been cited as central components of contemporary migration/ foreign development conversations.
# Collect predictions from last_fit or resamples
# .row in collect_predictions() indexes rows of the data supplied to
# initial_split() — here remit_train_clean, NOT remit_train — so the
# country/year labels must be joined back from that same (filtered)
# data frame to keep the indices aligned.
country_preds <- test_results |>
  collect_predictions() |>
  left_join(
    remit_train_clean |>
      mutate(row_id = row_number()) |>
      select(row_id, country_name, year),
    by = c(".row" = "row_id")
  )
# Respecifying our countries of interest and their predictions
# (seven in total: the original six plus Mexico)
countries_of_interest <- c("Nicaragua", "El Salvador", "Honduras",
"Guatemala", "Haiti", "India", "Mexico")
# Per-country RMSE on the holdout, alongside each country's average
# remittance share of GDP for context.
rmse_tbl <- country_preds |>
filter(country_name %in% countries_of_interest) |>
group_by(country_name) |>
summarise(
Error = rmse_vec(truth = remittances_gdp, estimate = .pred),
Avg_Remittances_GDP = mean(remittances_gdp, na.rm = TRUE),
.groups = "drop" ) |>
arrange(Error)
rmse_tbl# A tibble: 7 × 3
country_name Error Avg_Remittances_GDP
<chr> <dbl> <dbl>
1 India 0.344 2.89
2 Nicaragua 0.980 8.26
3 Haiti 1.18 12.6
4 Mexico 1.36 2.26
5 Honduras 2.48 15.2
6 El Salvador 3.35 18.7
7 Guatemala 3.65 12.6
Key Insights
# Compute residuals (Predicted - Actual)
# Positive residual = overprediction; the dashed line marks zero error.
residuals_df <- country_preds %>% filter(country_name %in% countries_of_interest) %>%
mutate(residual = .pred - remittances_gdp)
# Plot residuals over time
ggplot(residuals_df, aes(x = year, y = residual)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(alpha = 0.4, color = "steelblue") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
facet_wrap(~country_name, scales = "free_y") +
labs(title = "Residuals Over Time by Country",
x = "Year",
y = "Residual (Predicted - Actual)") +
theme_minimal()Some countries show consistent bias: predictions tend to be less accurate (and biased toward underprediction) in countries more reliant on remittances, and more accurate in countries relatively less reliant on them.
Volatility spikes around 2008 and 2020 suggest the model struggles during economic shocks.
India and Mexico show stable residuals, reinforcing the model’s reliability and increasing our ability to make policy recommendations in these contexts.
Overall, policymakers in the countries most reliant on remittances should interpret the model with caution, but it may be useful for larger countries (India/Mexico).
Comment
Cross-validation selected a non-zero penalty for the LASSO model, indicating that it improves predictive performance. The LASSO regularization path shows a small set of predictors entering the model as the penalty decreases, highlighting its role as a variable selection method.